### Set up the working environment: send HTTP(S) traffic through the Merck web proxy
import os
os.environ['http_proxy'] = "http://webproxy.merck.com:8080"
os.environ['https_proxy'] = "http://webproxy.merck.com:8080"
import pandas as pd
import numpy as np
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
# IPython magic (only valid inside a notebook/IPython session): draw figures inline
%matplotlib inline
import psycopg2
### For blank plot issue
import plotly.offline as pyo
import plotly.graph_objs as go
# Set notebook mode to work in offline
pyo.init_notebook_mode()
from pycaret.utils import enable_colab
enable_colab()
## Display whole dataframes: lift the pandas limits on printed rows and columns
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
# Load the bot-activity email extract and print a quick profile of it.
df1= pd.read_csv('DIRECT_INPUT_BOT_EMAIL.csv')
print(df1.nunique())
print(df1.columns)
# NOTE: DataFrame.info() prints its own report and returns None, so this also prints "None".
print(df1.info())
print(df1.shape)
df1.isnull().sum()
#df1.describe()
#del df1['BOT_ACTIVITY']
# Fully duplicated rows (every column equal to an earlier row).
duplicate = df1[df1.duplicated()]
print(duplicate)
# Non_Promotional_click
# Distribution of the BOT_ACTIVITY label.
df1['BOT_ACTIVITY'].value_counts()
# NA label
df1['BOT_ACTIVITY'].isnull().sum()
# German all bot activities (Netherland data) is 3.7% or ~96% human vs 40% Thrung's
# NOTE(review): the literals below are hard-coded; presumably copied from the
# value_counts() output above and the row count -- confirm before reusing.
100*(4690+ 1925)/176000
# Honeypot-based bot activity is 0.5% vs 3.7% (Netherland/German)
100*(449+ 470)/176000
# Load the SFMC mass-email action extract and print the same quick profile as for df1.
df2= pd.read_csv('DIRECT_INPUT_SFMC_F_ME_ACTION.csv')
print(df2.nunique())
print(df2.columns)
# NOTE: DataFrame.info() prints its own report and returns None, so this also prints "None".
print(df2.info())
print(df2.shape)
df2.isnull().sum()
# import os
# cwd = os.getcwd() # Get the current working directory (cwd)
# files = os.listdir(cwd) # Get all the files in that directory
# print("Files in %r: %s" % (cwd, files))
df1.head(3)
df2.head(3)
# Join keys between the two extracts:
# ClickId_ME: Click ID from ME, can be linked to EIS_ID in DIRECT_INPUT_SFMC_F_ME_ACTION (df2)
# SentId: Sent Id of ME, can be linked to '%User Activity Key' in DIRECT_INPUT_SFMC_F_ME_ACTION
#print(df1.columns)
#print(df2.columns)
# Column names of each frame as plain lists.
print(list(df1))
print(list(df2))
### First Join
# Link bot-labelled clicks (df1) to their SFMC action rows (df2): df1.ClickId_ME == df2.EIS_ID.
# NOTE(review): an earlier comment here said "remove NAs", but no NA removal happens
# before this join.
joined_table1 = pd.merge(df1, df2, how='inner', left_on='ClickId_ME', right_on='EIS_ID')
joined_table1.shape
joined_table1['BOT_ACTIVITY'].value_counts()
print(joined_table1.nunique())
print(joined_table1.columns)
# NOTE: DataFrame.info() prints its own report and returns None, so this also prints "None".
print(joined_table1.info())
print(joined_table1.shape)
joined_table1.isnull().sum()
# Sanity check: join on the EIS_ID column that both frames carry.
# The key is given once; the original listed it twice (on=['EIS_ID', 'EIS_ID']).
joined_table1_test = pd.merge(df1, df2, how='inner', on='EIS_ID')
joined_table1_test
## Remove a1t.... from SentId column
# Rows without a SentId cannot take part in a SentId-based join, so they are dropped too.
# na=False keeps the mask strictly boolean; without it a missing SentId yields NaN in the
# mask and the `~` negation raises TypeError.
usable_sent_id = df1['SentId'].notna() & ~df1['SentId'].str.startswith('a1t', na=False)
# .copy() gives an independent frame, so adding a column below neither triggers
# SettingWithCopyWarning nor depends on whether pandas returned a view of df1.
df1_updated = df1[usable_sent_id].copy()
print(df1_updated.head(3))
print(df1_updated.shape)
## Remove the second part of sentId
# SentId is '|'-separated; keep parts 0, 2 and 3 so the result can be matched
# against '%User Activity Key' in df2 (the join below).
df1_updated['SentId_updated'] = df1_updated['SentId'].str.split('|').apply(
    lambda parts: '|'.join([parts[0], parts[2], parts[3]])
)
#### Second Join
joined_table2 = pd.merge(
    df1_updated, df2, how='inner',
    left_on='SentId_updated', right_on='%User Activity Key',
)
joined_table2.shape
joined_table2.head(3)
print(joined_table2.nunique())
print(joined_table2.columns)
# NOTE: DataFrame.info() prints its own report and returns None, so this also prints "None".
print(joined_table2.info())
print(joined_table2.shape)
joined_table2.isnull().sum()
joined_table2['BOT_ACTIVITY'].value_counts()
joined_table1.head()
joined_table2['BOT_ACTIVITY'].value_counts()
joined_table2['%Journey Activity Key'].value_counts()
joined_table2.head(3)
# IPython automagic for %pwd (shows the working directory); a NameError outside IPython.
pwd
# Row counts per recipient email, per event time and per event date (largest first).
PingPU=joined_table1.groupby(['ME Action Email']).size().sort_values(ascending=False)
PingPU.head(30)
PingPU=joined_table1.groupby(['ME Event Time']).size().sort_values(ascending=False)
PingPU.head(10)
PingPU=joined_table1.groupby(['ME Event Date']).size().sort_values(ascending=False)
PingPU.head(10)
# Parse the three time-like columns so they can be grouped by calendar period.
joined_table1['ME Event Date'] = pd.to_datetime(joined_table1['ME Event Date'])
joined_table1['ME Event Time'] = pd.to_datetime(joined_table1['ME Event Time'])
joined_table1['%Calendar Key'] = pd.to_datetime(joined_table1['%Calendar Key'])
print(joined_table1.info())
#type(joined_table1['ME Event Date'])
# Monthly row counts for each of the three columns (freq='M' buckets by month end).
joined_table1.groupby(pd.Grouper(key='ME Event Time',freq='M')).size().sort_values(ascending=False)#.nunique()ascending=True
joined_table1.groupby(pd.Grouper(key='ME Event Date',freq='M')).size().sort_values(ascending=False)
joined_table1.groupby(pd.Grouper(key='%Calendar Key',freq='M')).size().sort_values(ascending=False)
# # Useless columns
# EMAIL_TYPE, ClickId_AE, ClickId_ME, SentId, EIS_ID_x, NON_BOT, SentId_updated, %Country Mailing Key, %Email Activity Key, %User Activity Key, %Country Key, %Mailing Key, %MDM Key
# %Campaign Key, Subscriber_List ID
# Triggered Email Key
# ME Bounce Reason, ME Unsubscribe Reason, EIS_ID_y
# Drop the key/bookkeeping columns; 'SentId' and '%Email Activity Key' are kept
# (see the trailing comment on the drop call) because later joins use SentId.
Final_joined_table1 = joined_table1.drop(['EMAIL_TYPE', 'ClickId_AE', 'ClickId_ME', 'EIS_ID_x', 'NON_BOT', '%Country Mailing Key', '%User Activity Key', '%Country Key', '%Mailing Key', '%MDM Key',
'%Campaign Key', 'Subscriber_List ID',
'Triggered Email Key',
'ME Bounce Reason', 'ME Unsubscribe Reason', 'EIS_ID_y', '%Journey Activity Key', 'ME Bounce Category'], axis='columns')# 'SentId', '%Email Activity Key'
Final_joined_table1.head(3)
print(Final_joined_table1.nunique())
print(Final_joined_table1.columns)
print(Final_joined_table1.info())
print(Final_joined_table1.shape)
Final_joined_table1.isnull().sum()
# Keep only rows where all four client/device attributes are present.
Final_joined_table1_NA = Final_joined_table1.dropna(subset= ['ME Browser', 'ME Email Client','ME OperatingSystem', 'ME Device'])
print(Final_joined_table1_NA.nunique())
print(Final_joined_table1_NA.columns)
print(Final_joined_table1_NA.info())
print(Final_joined_table1_NA.shape)
print(Final_joined_table1_NA.isnull().sum())
print(Final_joined_table1_NA['BOT_ACTIVITY'].value_counts())
# NOTE(review): hard-coded counts, presumably read off the output above -- confirm.
85115-68215
# Difference between 'ME Event Date' and 'ME Event Time' in seconds; it is only
# inspected with describe() and then dropped again together with the time columns.
Final_joined_table1_NA['ME Event Time dif'] = Final_joined_table1_NA['ME Event Date'] - Final_joined_table1_NA['ME Event Time']
Final_joined_table1_NA['ME Event Time dif'] = Final_joined_table1_NA['ME Event Time dif']/np.timedelta64(1,'s')
Final_joined_table1_NA.head()
Final_joined_table1_NA['ME Event Time dif'].describe()
Final_joined_table1_NA = Final_joined_table1_NA.drop(columns= ['ME Event Time dif', 'ME Event Time', '%Calendar Key'])
Final_joined_table1_NA.head(3)
print(Final_joined_table1_NA['BOT_ACTIVITY'].value_counts())
print(Final_joined_table1_NA['ME Event Type'].value_counts())
print(Final_joined_table1_NA.isnull().sum())
## Detect duplicate for honeypot around
# Rows that are identical once the BOT_ACTIVITY label is ignored.
Final_joined_table1_NA_test = Final_joined_table1_NA.drop(['BOT_ACTIVITY'], axis='columns')
duplicate1 = Final_joined_table1_NA_test[Final_joined_table1_NA_test.duplicated()]
#print(duplicate1)
duplicate1.shape
Final_joined_table1_NA.shape
# Rows that are identical including the label.
duplicate1 = Final_joined_table1_NA[Final_joined_table1_NA.duplicated()]
duplicate1.head(5)
duplicate1.shape
# Repeat both duplicate checks on the honeypot-labelled rows only.
Final_joined_table1_NA_honey = Final_joined_table1_NA.loc[Final_joined_table1_NA['BOT_ACTIVITY'].astype(str).isin(['Bot_click_Honeypot', 'Bot_click_Honeypot_around'])]
Final_joined_table1_NA_honey.head(3)
#Final_joined_table1_NA_honey.shape
duplicate = Final_joined_table1_NA_honey[Final_joined_table1_NA_honey.duplicated()]
duplicate.shape
Final_joined_table1_NA_honey_test = Final_joined_table1_NA_honey.drop(['BOT_ACTIVITY'], axis='columns')
duplicate = Final_joined_table1_NA_honey_test[Final_joined_table1_NA_honey_test.duplicated()]
duplicate.shape
Final_joined_table1_NA_honey_test.head()
#Groupby time and link
# Drop the URL-level columns from the NA-free table.
Final_joined_table1_NA = Final_joined_table1_NA.drop(['ME Send URL','ME URL ID','ME Click URL','ME Alias','ME_Is_Unique_for_URL'], axis='columns')
Final_joined_table1_NA.head(3)
print(Final_joined_table1_NA.isnull().sum())
# Two-column view (event date + label); this rebinds Final_joined_table1_NA_test from above.
Final_joined_table1_NA_test= Final_joined_table1_NA[['ME Event Date', 'BOT_ACTIVITY'
]]#'ME Email Client',
# 'ME OperatingSystem',
# 'ME Device'
Final_joined_table1['ME Event Type'].value_counts()
#Just Open and Sent have records for "categorical" vars including "isunique"
# NOTE(review): hard-coded counts, presumably the Open and Click row counts -- confirm.
68215 + 16900 #+ 4363 + 45
# Split by event type. Sent rows come from the table that still has NAs and all
# columns; Open and Click rows come from the NA-free, column-reduced table.
Final_joined_table1_Sent = Final_joined_table1.loc[Final_joined_table1['ME Event Type'].astype(str) == 'Sent'] #from 176k
Final_joined_table1_Open = Final_joined_table1_NA.loc[Final_joined_table1_NA['ME Event Type'].astype(str) == 'Open'] #from 85k no NAs no Sent
Final_joined_table1_Click = Final_joined_table1_NA.loc[Final_joined_table1_NA['ME Event Type'].astype(str) == 'Click'] #from 85k no NAs no Sent
print(Final_joined_table1_Sent.shape)
print(Final_joined_table1_Open.shape)
print(Final_joined_table1_Click.shape)
print(Final_joined_table1_Sent.isnull().sum())
print(Final_joined_table1_Open.isnull().sum())
print(Final_joined_table1_Click.isnull().sum())
print(Final_joined_table1_Sent.nunique()) # unique SentId 87258 no NAs
print(Final_joined_table1_Open.nunique()) # SentId 33374 no NAs
print(Final_joined_table1_Click.nunique()) # SentId 5621 no NAs
print(Final_joined_table1_Sent.shape) # (87353, 18)
print(Final_joined_table1_Open.shape) #(68215, 11)
print(Final_joined_table1_Click.shape) #(16900, 11)
Final_joined_table1.shape
Final_joined_table1_NA.shape
Final_joined_table1_Click.head(3)
Final_joined_table1_NA.head(3)
Final_joined_table1_Sent.isnull().sum()
# Click x Open per SentId: Click columns get suffix _x, Open columns get _y.
# SentId is not unique within Open/Click (see the nunique comments above), so this
# join multiplies rows for emails with several opens or clicks.
joined_table_Open_Click = pd.merge(Final_joined_table1_Click, Final_joined_table1_Open, how = 'inner', on= ['SentId'])# , '%Email Activity Key'
joined_table_Open_Click.shape
# Left joins starting from Sent keep emails that were never clicked/opened.
joined_table_Sent_Click = pd.merge(Final_joined_table1_Sent, Final_joined_table1_Click, how = 'left', on= ['SentId'])# , '%Email Activity Key'
joined_table_Sent_Click.shape
joined_table_Sent_Click_Open = pd.merge(joined_table_Sent_Click, Final_joined_table1_Open, how = 'left', on= ['SentId'])# , '%Email Activity Key'
joined_table_Sent_Click_Open.shape
joined_table_Sent_Click_Open.head(5)
# joined_table_Sent_Click_Open.dropna(subset= ['BOT_ACTIVITY_x', 'BOT_ACTIVITY_y', 'BOT_ACTIVITY'])
joined_table_Sent_Click_Open.shape
# Inner join of (Click x Open) with Sent: only emails that have all three events.
joined_table_Open_Click_Sent = pd.merge(joined_table_Open_Click, Final_joined_table1_Sent, how = 'inner', on= ['SentId'])# , '%Email Activity Key'
joined_table_Open_Click_Sent.shape
joined_table_Open_Click_Sent.head(4)
joined_table_Open_Click_Sent.columns
#### Keep the relevant columns
# Suffix convention from the joins that built joined_table_Open_Click_Sent:
#   *_x         -> Click row
#   *_y         -> Open row
#   (no suffix) -> Sent row
# .copy() makes this an independent frame, so the columns added below do not
# trigger SettingWithCopyWarning.
Final_joined_table_Open_Click_Sent = joined_table_Open_Click_Sent[[
    'SentId', 'BOT_ACTIVITY_x', 'BOT_ACTIVITY_y', 'BOT_ACTIVITY',
    'ME Action Email_x', 'ME Event Date_x', 'ME Event Type_x',
    'ME Event Date_y', 'ME Event Type_y',
    'ME Event Date', 'ME Event Type', 'ME_Is_Unique_x', 'ME Browser_x', 'ME Email Client_x',
    'ME OperatingSystem_x', 'ME Device_x',
]].copy()
Final_joined_table_Open_Click_Sent.head(10)
print(Final_joined_table_Open_Click_Sent.nunique())
print(Final_joined_table_Open_Click_Sent.columns)
# NOTE: DataFrame.info() prints its own report and returns None, so this also prints "None".
print(Final_joined_table_Open_Click_Sent.info())
print(Final_joined_table_Open_Click_Sent.shape)
print(Final_joined_table_Open_Click_Sent.isnull().sum())
print(Final_joined_table_Open_Click_Sent['BOT_ACTIVITY'].value_counts())
print(Final_joined_table_Open_Click_Sent['BOT_ACTIVITY_x'].value_counts())
print(Final_joined_table_Open_Click_Sent['BOT_ACTIVITY_y'].value_counts())
# Removed: `joined_table_sent_open_click_NA.shape`. That name was only ever assigned
# in commented-out code, so the statement raised NameError on a fresh run.

# Elapsed time between the three events, in seconds.
Final_joined_table_Open_Click_Sent['open_to_click'] = (
    Final_joined_table_Open_Click_Sent['ME Event Date_x']
    - Final_joined_table_Open_Click_Sent['ME Event Date_y']
) / np.timedelta64(1, 's')
Final_joined_table_Open_Click_Sent['sent_to_click'] = (
    Final_joined_table_Open_Click_Sent['ME Event Date_x']
    - Final_joined_table_Open_Click_Sent['ME Event Date']
) / np.timedelta64(1, 's')
Final_joined_table_Open_Click_Sent['sent_to_open'] = (
    Final_joined_table_Open_Click_Sent['ME Event Date_y']
    - Final_joined_table_Open_Click_Sent['ME Event Date']
) / np.timedelta64(1, 's')
Final_joined_table_Open_Click_Sent.head(2)
# Clicks after 2021-03-31 13:00:56.
# NOTE(review): this compares the string form of the timestamp; it is only correct
# while the column renders as 'YYYY-MM-DD HH:MM:SS' and a missing date (rendered
# 'NaT') would pass the filter. Consider comparing against pd.Timestamp instead.
Final_joined_table_Open_Click_Sent_HP = Final_joined_table_Open_Click_Sent.loc[
    Final_joined_table_Open_Click_Sent['ME Event Date_x'].astype(str) > '2021-03-31 13:00:56'
]
Final_joined_table_Open_Click_Sent_HP.shape
print(Final_joined_table_Open_Click_Sent_HP['BOT_ACTIVITY_x'].value_counts())
# NOTE(review): hard-coded counts, presumably read off the output above -- confirm.
100*(36898+24520)/215824
# Overlaid histograms of the three elapsed-time columns (seconds).
plt.figure(figsize=(8,6))
plt.hist(Final_joined_table_Open_Click_Sent['sent_to_open'], bins=30, alpha=0.55, label="sent_to_open", color='purple')
plt.hist(Final_joined_table_Open_Click_Sent['sent_to_click'], bins=90, alpha=0.99, label="sent_to_click", color='black')
plt.hist(Final_joined_table_Open_Click_Sent['open_to_click'], bins=90, alpha=0.5, label="open_to_click", color='yellow')
plt.xlabel("time", size=14)
plt.ylabel("Count", size=14)
plt.title("Open vs click")
plt.legend(loc='upper right')
plt.savefig("overlapping_histograms_with_matplotlib_Python_2.png")
#scatterplot
sns.set()
cols = ['BOT_ACTIVITY', 'ME Event Date_x',
        'ME Event Date_y', 'ME Event Date',
        'ME Browser_x', 'ME Email Client_x',
        'ME OperatingSystem_x', 'ME Device_x', 'open_to_click', 'sent_to_click',
        'sent_to_open']  # 'BOT_ACTIVITY_x', 'BOT_ACTIVITY_y', 'ME_Is_Unique_x'
# `height` is the current name of the facet-size argument; the old `size` keyword
# was renamed and is rejected by current seaborn releases.
sns.pairplot(Final_joined_table_Open_Click_Sent[cols], height=2.5)
plt.show()
cols
# The HP link can be found in the URL that contains "https://www.msd.nl/?utm_term=bot_activity"
# Load the pre-joined Netherlands dataset (Sent rows joined with Click/Open rows).
df = pd.read_csv('Netherland_email_for_anomaly_detection_since_March_31_2021_with_23534_37.csv')
# Label distribution. This line originally ran before `df` was read, which is a
# NameError on a fresh top-to-bottom run.
df['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].value_counts()
print(df.nunique())
print(df.columns)
# NOTE: DataFrame.info() prints its own report and returns None, so this also prints "None".
print(df.info())
print(df.shape)
df.isnull().sum()
df.head()
# Number of rows for one specific recipient.
df[['ME Action Email']].eq('w.hermans@elisabeth.nl').sum()
# Working subset. .copy() makes it independent of `df`, so the columns added to
# df_sub further down do not trigger SettingWithCopyWarning.
df_sub = df[['SentId','ME Action Email', 'ME Event Type_FROM_SENT',
             'ME Event Type_FROM_CLICK_OR_OPEN', 'ME Event Date_FROM_SENT',
             'ME Event Date_FROM_CLICK_OR_OPEN',
             'BOT_ACTIVITY_FROM_CLICK_OR_OPEN', 'ME_Is_Unique_FROM_CLICK_OR_OPEN',
             'ME Browser_FROM_CLICK_OR_OPEN', 'ME Email Client_FROM_CLICK_OR_OPEN',
             'ME OperatingSystem_FROM_CLICK_OR_OPEN', 'ME Device_FROM_CLICK_OR_OPEN']].copy()
df_sub.head(3)
# Order events per recipient and per sent email, oldest first (all other
# sort_values options are left at their defaults).
df_sub_sorted = df_sub.sort_values(by=['ME Action Email', 'SentId', 'ME Event Date_FROM_CLICK_OR_OPEN'])
df_sub_sorted.head(3)
df_sub_sorted.columns
df_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].value_counts()
print(df_sub.nunique())
df_sub.isnull().sum()
# Parse both timestamps and derive the sent -> open/click delay in seconds.
# Work on an independent copy so the assignments below never hit a slice of `df`.
df_sub = df_sub.copy()
df_sub['ME Event Date_FROM_SENT'] = pd.to_datetime(df_sub['ME Event Date_FROM_SENT'])
df_sub['ME Event Date_FROM_CLICK_OR_OPEN'] = pd.to_datetime(df_sub['ME Event Date_FROM_CLICK_OR_OPEN'])
df_sub['Sent_to_OpenClick'] = (
    df_sub['ME Event Date_FROM_CLICK_OR_OPEN'] - df_sub['ME Event Date_FROM_SENT']
) / np.timedelta64(1, 's')
df_sub.head()
plt.figure(figsize=(8,6))
plt.hist(df_sub['Sent_to_OpenClick'], bins=30, alpha=0.55, label="sent_to_OpenClick", color='purple')
plt.xlabel("time", size=14)
plt.ylabel("Count", size=14)
plt.title("Sent to Open/Click")
plt.legend(loc='upper right')
# Own file name: this figure used to be saved as
# "overlapping_histograms_with_matplotlib_Python_2.png", silently overwriting the
# open-vs-click histogram saved earlier under that name.
plt.savefig("sent_to_openclick_histogram.png")
# Split the subset by event type.
df_sub_Open = df_sub.loc[df_sub['ME Event Type_FROM_CLICK_OR_OPEN'].astype(str) == 'Open']
df_sub_Click = df_sub.loc[df_sub['ME Event Type_FROM_CLICK_OR_OPEN'].astype(str) == 'Click']
print('num of Clicks', df_sub_Click.shape)
print('num of Opens', df_sub_Open.shape)
print('Sents', df_sub.shape)
# NOTE(review): hard-coded counts, presumably opens x clicks (size of a full cross
# join) -- confirm.
17353*6181
# Pair every Open with every Click of the same recipient and sent email
# (two join keys; Open columns get suffix _x, Click columns get _y).
Joining_Open_and_Click_3features = pd.merge(df_sub_Open, df_sub_Click, how = 'inner', on= ['ME Action Email', 'SentId'])
Joining_Open_and_Click_3features.shape
Joining_Open_and_Click_3features.head(30)
df_sub.columns
# Removed: `df_sub_sub.head`. df_sub_sub is not defined yet at this point
# (NameError) and the call parentheses were missing as well.
df_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].value_counts()
### Just keep honeypot labels - 0, 1 ###
# .copy() makes the subset independent of df_sub, so the relabelling below does
# not trigger SettingWithCopyWarning.
df_sub_sub1 = df_sub[['BOT_ACTIVITY_FROM_CLICK_OR_OPEN',
                      'ME_Is_Unique_FROM_CLICK_OR_OPEN', 'ME Browser_FROM_CLICK_OR_OPEN',
                      'ME Email Client_FROM_CLICK_OR_OPEN',
                      'ME OperatingSystem_FROM_CLICK_OR_OPEN', 'ME Device_FROM_CLICK_OR_OPEN',
                      ]].copy()  # ,'Sent_to_OpenClick', 'refrence_time_to_OpenClick'
# Drop the model-derived labels; only 'false' and the honeypot labels remain.
model_labelled = df_sub_sub1['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].astype(str).isin(
    ['Bot_open_model', 'Bot_click_model']
)
df_sub_sub1 = df_sub_sub1.loc[~model_labelled].copy()
# 0 = not a bot ('false'), 1 = honeypot bot click.
df_sub_sub1['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'] = df_sub_sub1['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].replace(
    {'false': 0, 'Bot_click_Honeypot': 1, 'Bot_click_Honeypot_around': 1}
)
######### Pycaret ###########
#############################
# Importing module and initializing setup
from pycaret.anomaly import *
# NOTE(review): the label column is part of the data handed to the unsupervised
# setup, so it is presumably used as a feature -- confirm whether that is intended.
ano1 = setup(data = df_sub_sub1, preprocess=True, normalize= True)#, fraction = 0.05
# creating a model
# NOTE(review): the variable is called `iforest` but the model created is 'sos'.
iforest = create_model('sos')
# plotting a model
#plot_model(iforest, plot = 'tsne')#, feature = None, label = False, scale = 1, save False, display_format = None
all_models = models()
all_models
iforest_anomoly_label = assign_model(iforest, score = True, verbose= True)
iforest_anomoly_label
# Rows = actual honeypot label, columns = model's anomaly flag. The original call
# passed the two series the other way round, so the axis titles were swapped.
confusion_matrix = pd.crosstab(
    iforest_anomoly_label['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'],
    iforest_anomoly_label['Anomaly'],
    rownames=['Actual'], colnames=['Predicted'],
)
print (confusion_matrix)
svm = create_model('svm')
# plotting a model
plot_model(svm, plot = 'tsne')
evaluate_model(svm)
evaluate_model(iforest)
svm_anomoly_label = assign_model(svm, score = True, verbose= True)
svm_anomoly_label
# Score the frame the experiment was set up on. The original passed `df_sub_sub`,
# which is not defined until later in the file (NameError on a fresh run).
# NOTE(review): despite its name this holds the *svm* model's predictions.
iforest_predictions = predict_model(model = svm, data = df_sub_sub1)
iforest_predictions
confusion_matrix = pd.crosstab(
    svm_anomoly_label['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'],
    svm_anomoly_label['Anomaly'],
    rownames=['Actual'], colnames=['Predicted'],
)
print (confusion_matrix)
# !pip install -U scikit-learn
# !pip install --upgrade sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.svm import OneClassSVM
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from IPython.display import Image as PImage
from subprocess import check_call
from PIL import Image, ImageDraw, ImageFont
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
# define outlier detection model
model = OneClassSVM(gamma='scale', nu=0.01)
## X and y definition for train_test_split
# .copy() makes the subset independent of df_sub, so the relabelling below does
# not trigger SettingWithCopyWarning.
df_sub_sub = df_sub[['BOT_ACTIVITY_FROM_CLICK_OR_OPEN',
                     'ME_Is_Unique_FROM_CLICK_OR_OPEN', 'ME Browser_FROM_CLICK_OR_OPEN',
                     'ME Email Client_FROM_CLICK_OR_OPEN',
                     'ME OperatingSystem_FROM_CLICK_OR_OPEN', 'ME Device_FROM_CLICK_OR_OPEN',
                     'Sent_to_OpenClick']].copy()
# Seconds between 2021-01-01 00:00:00 and the open/click event. df_sub only gets
# this column further down the file, so selecting it from df_sub here raised
# KeyError on a fresh run; it is computed directly instead (same formula).
df_sub_sub['refrence_time_to_OpenClick'] = (
    pd.to_datetime(df_sub['ME Event Date_FROM_CLICK_OR_OPEN'])
    - pd.Timestamp('2021-01-01 00:00:00')
) / np.timedelta64(1, 's')
# scikit-learn outlier convention: 1 = inlier (human, 'false'), -1 = outlier (any bot label).
df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'] = df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].replace({
    'false': 1,
    'Bot_click_Honeypot': -1,
    'Bot_click_Honeypot_around': -1,
    'Bot_open_model': -1,
    'Bot_click_model': -1,
})
df_sub_sub_x = df_sub_sub[['ME_Is_Unique_FROM_CLICK_OR_OPEN', 'ME Browser_FROM_CLICK_OR_OPEN',
                           'ME Email Client_FROM_CLICK_OR_OPEN',
                           'ME OperatingSystem_FROM_CLICK_OR_OPEN', 'ME Device_FROM_CLICK_OR_OPEN',
                           'Sent_to_OpenClick']]
## One-hot encoding
# NOTE(review): only the five categorical columns are encoded; 'Sent_to_OpenClick'
# is selected into df_sub_sub_x above but does not reach X -- confirm that is intended.
one_hot_data = pd.get_dummies(df_sub_sub_x[['ME_Is_Unique_FROM_CLICK_OR_OPEN', 'ME Browser_FROM_CLICK_OR_OPEN',
                                            'ME Email Client_FROM_CLICK_OR_OPEN',
                                            'ME OperatingSystem_FROM_CLICK_OR_OPEN', 'ME Device_FROM_CLICK_OR_OPEN']],
                              drop_first=True, dtype=float)
X, y = one_hot_data, df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=1)
# fit on majority class only (rows labelled 1)
X_train = X_train[y_train==1]
model.fit(X_train)
# detect outliers in the test set
yhat = model.predict(X_test)
# Labels are already 1 / -1 (the former `y_test[y_test == 1] = 1` lines were
# no-ops); only convert to a plain int array for f1_score.
y_test = y_test.to_numpy().astype(int)
# calculate score, treating the outlier class (-1) as the positive class
score = f1_score(y_test, yhat, pos_label=-1)
print('F1 Score: %.3f' % score)
df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].value_counts()
y_test
yhat
yhat.shape
#_test.shape
# isolation forest for imbalanced classification
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.ensemble import IsolationForest
# define outlier detection model
# `behaviour='new'` was dropped: that transitional argument has been removed from
# scikit-learn and raises TypeError on current releases. random_state makes the
# score reproducible, matching the seeded train/test split.
model = IsolationForest(contamination=0.4, random_state=1)
# fit on majority class (X_train was already reduced to the rows labelled 1)
model.fit(X_train)
# detect outliers in the test set
yhat = model.predict(X_test)
# y_test is already an int array of 1 (inlier) / -1 (outlier); the former
# `y_test[y_test == '1'] = 1` lines compared ints with strings and changed nothing.
# calculate score, treating the outlier class (-1) as the positive class
score = f1_score(y_test, yhat, pos_label=-1)
print('F1 Score: %.3f' % score)
# Binary target for classification: 0 = human ('false'), 1 = any bot label.
# The labels are rebuilt from df_sub's untouched label column: df_sub_sub's own
# column was already recoded to 1 / -1 for the outlier models above, so replacing
# the original strings on it matched nothing and left the 1 / -1 coding in place.
# The assignment aligns on the index that df_sub_sub shares with df_sub.
df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'] = df_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].replace({
    'false': 0,
    'Bot_click_Honeypot': 1,
    'Bot_click_Honeypot_around': 1,
    'Bot_open_model': 1,
    'Bot_click_model': 1,
})
df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].value_counts()
import pycaret
# NOTE: this star import rebinds setup/create_model/... from pycaret.anomaly.
from pycaret.classification import *
exp_clf101 = setup(data = df_sub_sub,
                   ignore_low_variance= True , combine_rare_levels= True,
                   remove_multicollinearity= True, use_gpu= True,
                   target = 'BOT_ACTIVITY_FROM_CLICK_OR_OPEN' , session_id=123)
### Train
#(TIP : It's always good to look at the standard deviation of CV results when using create_model()
best_model = compare_models()
df_sub.head()
# Express every open/click timestamp as the number of seconds elapsed since a
# fixed reference instant (2021-01-01 00:00:00).
df_sub['refrence_time'] = '2021-01-01 00:00:00'
df_sub['ME Event Date_FROM_CLICK_OR_OPEN'] = pd.to_datetime(
    df_sub['ME Event Date_FROM_CLICK_OR_OPEN']
)
df_sub['refrence_time'] = pd.to_datetime(df_sub['refrence_time'])
df_sub['refrence_time_to_OpenClick'] = (
    df_sub['ME Event Date_FROM_CLICK_OR_OPEN'].sub(df_sub['refrence_time'])
    / np.timedelta64(1, 's')
)
df_sub.head()
# Oldest-first ordering per recipient and sent email; every other sort option
# stays at its pandas default.
df_sub_sorted = df_sub.sort_values(
    by=['ME Action Email', 'SentId', 'ME Event Date_FROM_CLICK_OR_OPEN'],
)
df_sub_sorted.head()
# Load the dataset that carries explicit Sent/Open/Click timestamps per row.
df_3 = pd.read_csv('add_sent_open_click_time_23534_40.csv')
print(df_3.shape)
df_3.isnull().sum()
df_3.columns
# Working subset. .copy() keeps it independent of df_3 for later in-place edits.
df_3_sub = df_3[['SentId','ME Action Email', 'ME Event Type_FROM_CLICK_OR_OPEN',
                 'BOT_ACTIVITY_FROM_CLICK_OR_OPEN',
                 'ME_Is_Unique_FROM_CLICK_OR_OPEN',
                 'ME Browser_FROM_CLICK_OR_OPEN', 'ME Email Client_FROM_CLICK_OR_OPEN',
                 'ME OperatingSystem_FROM_CLICK_OR_OPEN', 'ME Device_FROM_CLICK_OR_OPEN',
                 'Sent_Time', 'Open_Time', 'Click_Time']].copy()
# Device distribution. This line originally ran before df_3_sub was created,
# which is a NameError on a fresh top-to-bottom run.
df_3_sub['ME Device_FROM_CLICK_OR_OPEN'].value_counts()
df_3_sub.head(30)
# Order per recipient, sent email, then sent/open time (other sort options at defaults).
df_3_sub_sorted = df_3_sub.sort_values(by=['ME Action Email', 'SentId', 'Sent_Time', 'Open_Time'])
df_3_sub_sorted.head()
df_3_sub_sorted.shape
######### Separate Click and Open ########
def _add_time_deltas(events):
    """Return a copy of *events* with parsed timestamps and elapsed-second columns.

    'Sent_Time', 'Open_Time' and 'Click_Time' are converted with ``pd.to_datetime``;
    'Sent_to_Open', 'Sent_to_Click' and 'Open_to_Click' are added (in that order)
    as the differences between them in seconds. The input frame is not modified,
    which also avoids SettingWithCopyWarning when a filtered slice is passed in.
    """
    events = events.copy()
    for column in ('Sent_Time', 'Open_Time', 'Click_Time'):
        events[column] = pd.to_datetime(events[column])
    one_second = np.timedelta64(1, 's')
    events['Sent_to_Open'] = (events['Open_Time'] - events['Sent_Time']) / one_second
    events['Sent_to_Click'] = (events['Click_Time'] - events['Sent_Time']) / one_second
    events['Open_to_Click'] = (events['Click_Time'] - events['Open_Time']) / one_second
    return events


df_3_sub_sorted_Open = _add_time_deltas(
    df_3_sub_sorted.loc[df_3_sub_sorted['ME Event Type_FROM_CLICK_OR_OPEN'].astype(str) == 'Open']
)
df_3_sub_sorted_Click = _add_time_deltas(
    df_3_sub_sorted.loc[df_3_sub_sorted['ME Event Type_FROM_CLICK_OR_OPEN'].astype(str) == 'Click']
)
# "Immediate" flags: 1 when the delay is below the threshold (seconds), else 0.
# A missing delay (NaN) compares as False and is therefore flagged 0.
df_3_sub_sorted_Open['is_immidiate_Open'] = np.where(df_3_sub_sorted_Open['Sent_to_Open'] < 30, 1, 0)
df_3_sub_sorted_Open['is_immidiate_Click'] = np.where(df_3_sub_sorted_Open['Open_to_Click'] < 3, 1, 0)
# NOTE(review): for Click rows the flag is based on Sent_to_Click, whereas for Open
# rows it is based on Open_to_Click -- confirm the difference is intentional.
df_3_sub_sorted_Click['is_immidiate_Click'] = np.where(df_3_sub_sorted_Click['Sent_to_Click'] < 3, 1, 0)
df_3_sub_sorted_Open.tail()
df_3_sub_sorted_Open.shape
# Persist the Open rows (with the derived delay/flag columns) for inspection.
df_3_sub_sorted_Open.to_csv('df_3_sub_sorted_Open.csv')
df_3_sub_sorted_Open['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].value_counts()
df_3_sub_sorted_Click.shape
df_3_sub_sorted_Click['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].value_counts()
df_3_sub_sorted_Open.columns
# Positional split: first 12000 Open rows vs. the remainder ("unseen").
# NOTE(review): the frame is sorted by recipient email, so this split is by
# position in that ordering, not random -- confirm that is intended.
df_3_sub_sorted_Open_1 = df_3_sub_sorted_Open.iloc[:12000,:]
df_3_sub_sorted_Open_2 = df_3_sub_sorted_Open.iloc[12000:,:] ## Unseen
data_unseen_Open = df_3_sub_sorted_Open_2
df_3_sub_sorted_Open_1['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].value_counts()
##### Unseen
df_3_sub_sorted_Open_2['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].value_counts() #### Unseen
# Alternative random split: 95% sample (seeded) vs. the remaining 5%.
data = df_3_sub_sorted_Open.sample(frac = 0.95, random_state = 42)
data_unseen_random = df_3_sub_sorted_Open.drop(data.index)
data.reset_index(inplace = True, drop = True)
data_unseen_random.reset_index(inplace = True, drop = True)
# Label distribution of each split.
data['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].value_counts()
data_unseen_Open['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].value_counts()
data_unseen_random['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].value_counts()
df_sub_sub.head(2)
# Rebuild df_sub_sub from df_3_sub, keeping only 'false' and the honeypot labels.
# .copy() detaches the result from df_3_sub, so the relabelling below neither
# triggers SettingWithCopyWarning nor risks writing through to df_3_sub.
model_labelled = df_3_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].astype(str).isin(
    ['Bot_open_model', 'Bot_click_model']
)
df_sub_sub = df_3_sub.loc[~model_labelled].copy()
# -1 = human ('false'), 1 = honeypot bot click.
df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'] = df_sub_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].replace(
    {'false': -1, 'Bot_click_Honeypot': 1, 'Bot_click_Honeypot_around': 1}
)
df_3_sub_sorted_Open
## Unseen
# data = df_3_sub_sorted_Open.sample(frac=0.95, random_state=42)
# data_unseen = df_3_sub_sorted_Open.drop(data.index)
# data.reset_index(inplace=True, drop=True)
# data_unseen.reset_index(inplace=True, drop=True)
import pycaret
from pycaret.classification import *
# Configure the PyCaret classification experiment for the open/honeypot data.
# Identifier and raw timestamp columns are excluded from the feature set and the
# two click deltas are forced to be treated as numeric.
# NOTE(review): the model is later scored on data_unseen_Open, which is sliced
# from df_3_sub_sorted_Open rather than from df_sub_sub - confirm that the rows
# and the label encoding of the two frames line up.
exp_clf101 = setup(
    data=df_sub_sub,  # alternative the author toggled: df_3_sub_sorted_Open_1
    target='BOT_ACTIVITY_FROM_CLICK_OR_OPEN',
    ignore_features=[
        'ME Action Email',
        'SentId',
        'Sent_Time',
        'Open_Time',
        'Click_Time',
        'ME Event Type_FROM_CLICK_OR_OPEN',
    ],
    numeric_features=['Sent_to_Click', 'Open_to_Click'],
    ignore_low_variance=True,
    combine_rare_levels=True,
    data_split_stratify=True,
    remove_multicollinearity=True,
    use_gpu=True,
    session_id=123,
)
# Options the author tried and left disabled: preprocess=False, fix_imbalance=True,
# test_data, profile=True, transformation=True, feature_selection=True,
# high_cardinality_features=['emailclient', 'operatingsystem'], and ignoring
# further columns (Sent_to_Open, Sent_to_Click, Open_to_Click, is_immidiate_Open,
# is_immidiate_Click, ME_Is_Unique / ME Browser / ME Email Client /
# ME OperatingSystem _FROM_CLICK_OR_OPEN).
# Rank the candidate classifiers for the open experiment and keep the best one.
best_model_Open = compare_models()
# A gradient-boosting model (create_model('gbc') / tune_model) was tried here
# earlier and is disabled; KNN is trained explicitly instead.
knn = create_model('knn')
print(knn)
plot_model(knn, plot='auc')
print(best_model_Open)

# Score the positional hold-out with the best model.
#unseen_result = predict_model(best_model_Open, data = data_unseen)
unseen_result = predict_model(best_model_Open, data=data_unseen_Open)
unseen_result.head()
# Rows = true label, columns = predicted label.
confusion_matrix = pd.crosstab(
    unseen_result['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'],
    unseen_result['Label'],
    rownames=['Actual'],
    colnames=['Predicted'],
)
print(confusion_matrix)

# Feature importance. The original call referenced `gbc`, whose creation is
# commented out above; use the selected model instead, as the click experiment
# does with best_model_Click.
plot_model(best_model_Open, plot='feature')
# --- Click events: inspect the data and carve out a positional hold-out ---
target_col = 'BOT_ACTIVITY_FROM_CLICK_OR_OPEN'

# Size, sample rows and label balance of the full click subset and of the
# "NoGrmn" variant.
df_3_sub_sorted_Click.shape
df_3_sub_sorted_Click.head()
df_3_sub_sorted_Click[target_col].value_counts()
df_3_sub_sorted_Click_NoGrmn[target_col].value_counts()

# First 4000 rows are the "seen" part, the remainder is kept as unseen data.
df_3_sub_sorted_Click_NoGrmn_1 = df_3_sub_sorted_Click_NoGrmn.iloc[:4000]
df_3_sub_sorted_Click_NoGrmn_2 = df_3_sub_sorted_Click_NoGrmn.iloc[4000:]
data_unseen_Click = df_3_sub_sorted_Click_NoGrmn_2

# Label balance of both parts.
df_3_sub_sorted_Click_NoGrmn_1[target_col].value_counts()
df_3_sub_sorted_Click_NoGrmn_2[target_col].value_counts()
df_3_sub_sorted_Click_NoGrmn.head()
target_col = 'BOT_ACTIVITY_FROM_CLICK_OR_OPEN'

# df_sub_sub: df_3_sub without the 'Bot_open_model' rows. .copy() makes it an
# independent frame so the label re-coding below does not assign into a slice
# of df_3_sub (SettingWithCopyWarning).
df_sub_sub = df_3_sub.loc[df_3_sub[target_col].astype(str) != 'Bot_open_model'].copy()

# Click subset without the model-based label and without 'Bot_click_Honeypot_around'.
click_labels = df_3_sub_sorted_Click[target_col].astype(str)
df_3_sub_sorted_Click_NoGrmn = df_3_sub_sorted_Click.loc[
    ~click_labels.isin(['Bot_click_model', 'Bot_click_Honeypot_around'])
]

# Re-code the labels of df_sub_sub: honeypot hits -> 1, everything else -> -1.
# ('Bot_open_model' rows were removed above, so that mapping matches nothing.)
for raw_label, encoded in (
    ('false', -1),
    ('Bot_click_Honeypot', 1),
    ('Bot_click_Honeypot_around', 1),
    ('Bot_open_model', -1),
    ('Bot_click_model', -1),
):
    df_sub_sub[target_col] = df_sub_sub[target_col].replace(raw_label, encoded)
## Unseen
# data_Click = df_3_sub_sorted_Click_NoGrmn.sample(frac=0.95, random_state=42)
# data_unseen_Click = df_3_sub_sorted_Click_NoGrmn.drop(data_Click.index)
# data_Click.reset_index(inplace=True, drop=True)
# data_unseen_Click.reset_index(inplace=True, drop=True)
import pycaret
from pycaret.classification import *
# Configure the PyCaret classification experiment for the click subset without
# model-based labels. Identifier and raw timestamp columns are not used as features.
exp_clf101 = setup(
    data=df_3_sub_sorted_Click_NoGrmn,
    target='BOT_ACTIVITY_FROM_CLICK_OR_OPEN',
    ignore_features=[
        'ME Action Email',
        'SentId',
        'Sent_Time',
        'Open_Time',
        'Click_Time',
        'ME Event Type_FROM_CLICK_OR_OPEN',
    ],
    ignore_low_variance=True,
    combine_rare_levels=True,
    data_split_stratify=True,
    remove_multicollinearity=True,
    use_gpu=True,
    session_id=123,
)
# Options the author tried and left disabled: also ignoring 'Open_to_Click',
# 'Sent_to_Open', 'is_immidiate_Click', 'Sent_to_Click';
# numeric_features=['Sent_to_Click', 'Open_to_Click']; profile=True;
# high_cardinality_features=['emailclient', 'operatingsystem'];
# transformation=True; feature_selection=True.
# Rank the candidate classifiers for the click experiment and keep the best one.
best_model_Click = compare_models()
# Feature importance of the selected model.
plot_model(best_model_Click, plot='feature')

# Score the positional click hold-out.
#unseen_result = predict_model(best_model_Open, data = data_unseen)
unseen_result_Click = predict_model(best_model_Click, data=data_unseen_Click)
unseen_result_Click.head(60)
unseen_result_Click['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].value_counts()
unseen_result_Click['Label'].value_counts()

# Rows = true label, columns = predicted label ('Label' is the column
# predict_model adds). The original passed the two series in the opposite order,
# so the axis names "Actual"/"Predicted" were swapped relative to the data.
confusion_matrix = pd.crosstab(
    unseen_result_Click['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'],
    unseen_result_Click['Label'],
    rownames=['Actual'],
    colnames=['Predicted'],
)
print(confusion_matrix)
# Second click experiment on the full click subset: 'false' becomes -1 and every
# bot label (honeypot, honeypot_around, model) becomes 1.
# The label column of df_3_sub_sorted_Click itself is overwritten.
#df_sub_sub = df_3_sub
click_target = 'BOT_ACTIVITY_FROM_CLICK_OR_OPEN'
for raw_label, encoded in (
    ('false', -1),
    ('Bot_click_Honeypot', 1),
    ('Bot_click_Honeypot_around', 1),
    ('Bot_click_model', 1),
):
    df_3_sub_sorted_Click[click_target] = df_3_sub_sorted_Click[click_target].replace(raw_label, encoded)
import pycaret
from pycaret.classification import *
# Same setup as the previous click experiment, except that 'ME Action Email' is
# not ignored and the split is not stratified.
exp_clf101 = setup(
    data=df_3_sub_sorted_Click,
    target='BOT_ACTIVITY_FROM_CLICK_OR_OPEN',
    ignore_features=[
        'SentId',
        'Sent_Time',
        'Open_Time',
        'Click_Time',
        'ME Event Type_FROM_CLICK_OR_OPEN',
    ],
    ignore_low_variance=True,
    combine_rare_levels=True,
    remove_multicollinearity=True,
    use_gpu=True,
    session_id=123,
)
# Disabled options: numeric_features=['Sent_to_Click', 'Open_to_Click'], profile=True,
# high_cardinality_features=['emailclient', 'operatingsystem'],
# transformation=True, feature_selection=True.
best_model_Click_2 = compare_models()
# Scatter-plot matrix of the timing features for the click subset.
sns.set()
cols = ['Open_to_Click', 'Sent_to_Open', 'is_immidiate_Click',
        'Sent_to_Click', 'BOT_ACTIVITY_FROM_CLICK_OR_OPEN']  # 'BOT_ACTIVITY_x', 'BOT_ACTIVITY_y', 'ME_Is_Unique_x'
# `size` was renamed to `height` in seaborn 0.9 and later removed.
sns.pairplot(df_3_sub_sorted_Click_NoGrmn[cols], height=2.5)
plt.show()

# Label plus the three timing deltas; defined before it is plotted (the
# original used this frame two statements before creating it).
# Left out by the author: ME_Is_Unique / ME Browser / ME Email Client /
# ME OperatingSystem / ME Device _FROM_CLICK_OR_OPEN.
hue_col = 'BOT_ACTIVITY_FROM_CLICK_OR_OPEN'
df_3_sub_sorted_Click_NoGrmn_corr = df_3_sub_sorted_Click_NoGrmn[
    [hue_col, 'Sent_to_Open', 'Sent_to_Click', 'Open_to_Click']
]
# Plot every column except the label. The original `columns[:-1]` assumed the
# label was the last column; it is the first one, so the label was plotted as a
# variable and 'Open_to_Click' was dropped.
pair_vars = [c for c in df_3_sub_sorted_Click_NoGrmn_corr.columns if c != hue_col]
pair_vars
g = sns.pairplot(
    df_3_sub_sorted_Click_NoGrmn_corr,
    hue=hue_col,
    diag_kind='hist',
    vars=pair_vars,
    plot_kws=dict(alpha=0.5),
    diag_kws=dict(alpha=0.5),
)
plt.show()
# Keep the identifier, label, client-description and timestamp columns of df_3.
# .copy() makes df_3_sub an independent frame, so the columns added below are
# not assigned into a slice of df_3 (SettingWithCopyWarning).
df_3_sub = df_3[['SentId', 'ME Action Email', 'ME Event Type_FROM_CLICK_OR_OPEN',
                 'BOT_ACTIVITY_FROM_CLICK_OR_OPEN',
                 'ME_Is_Unique_FROM_CLICK_OR_OPEN',
                 'ME Browser_FROM_CLICK_OR_OPEN', 'ME Email Client_FROM_CLICK_OR_OPEN',
                 'ME OperatingSystem_FROM_CLICK_OR_OPEN', 'ME Device_FROM_CLICK_OR_OPEN',
                 'Sent_Time', 'Open_Time', 'Click_Time']].copy()
df_3_sub.head(30)

# Parse the three event timestamps.
for time_col in ('Sent_Time', 'Open_Time', 'Click_Time'):
    df_3_sub[time_col] = pd.to_datetime(df_3_sub[time_col])

# Elapsed time between events, expressed in seconds.
one_second = np.timedelta64(1, 's')
df_3_sub['Sent_to_Open'] = (df_3_sub['Open_Time'] - df_3_sub['Sent_Time']) / one_second
df_3_sub['Sent_to_Click'] = (df_3_sub['Click_Time'] - df_3_sub['Sent_Time']) / one_second
df_3_sub['Open_to_Click'] = (df_3_sub['Click_Time'] - df_3_sub['Open_Time']) / one_second
df_3_sub.head()

# Offset of every event, in seconds, from a fixed reference timestamp.
df_3_sub['reference_time'] = '2021-03-01 00:00:00'
#df_3_sub['ME Event Date_FROM_CLICK_OR_OPEN'] = pd.to_datetime(df_sub['ME Event Date_FROM_CLICK_OR_OPEN'])
df_3_sub['reference_time'] = pd.to_datetime(df_3_sub['reference_time'])
for event in ('Open', 'Click', 'Sent'):
    df_3_sub['reference_time_to_' + event] = (
        df_3_sub[event + '_Time'] - df_3_sub['reference_time']
    ) / one_second

# Flags for "immediate" reactions: opened less than 30 s after sending, clicked
# less than 3 s after opening. A missing delta (NaN) compares False and yields 0.
df_3_sub['is_immidiate_Open'] = np.where(df_3_sub['Sent_to_Open'] < 30, 1, 0)
df_3_sub['is_immidiate_Click'] = np.where(df_3_sub['Open_to_Click'] < 3, 1, 0)
df_3_sub.head(3)
# Reduce df_3_sub to the modelling columns: e-mail id, label, client description,
# the three deltas and the three reference-time offsets. Raw timestamps and the
# is_immidiate_* flags are left out.
# .copy() so that later in-place edits of the label column (made through aliases
# of df_3_sub) work on an independent frame instead of a slice.
df_3_sub = df_3_sub[[
    'ME Action Email',
    'BOT_ACTIVITY_FROM_CLICK_OR_OPEN',
    'ME_Is_Unique_FROM_CLICK_OR_OPEN',
    'ME Browser_FROM_CLICK_OR_OPEN', 'ME Email Client_FROM_CLICK_OR_OPEN',
    'ME OperatingSystem_FROM_CLICK_OR_OPEN', 'ME Device_FROM_CLICK_OR_OPEN',
    'Sent_to_Open', 'Sent_to_Click', 'Open_to_Click',
    'reference_time_to_Open', 'reference_time_to_Click', 'reference_time_to_Sent',
]].copy()
df_3_sub.head()
# d=df_3_sub.dropna()
# d.shape
# d['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].value_counts()
# Overlay the distributions of the three timing deltas in one figure and save it.
plt.figure(figsize=(8, 6))
for column, n_bins, opacity, legend_label, colour in (
    ('Sent_to_Open', 30, 0.55, "sent_to_Open", 'purple'),
    ('Sent_to_Click', 90, 0.99, "sent_to_click", 'black'),
    ('Open_to_Click', 90, 0.5, "open_to_click", 'yellow'),
):
    plt.hist(df_3_sub[column], bins=n_bins, alpha=opacity, label=legend_label, color=colour)
plt.xlabel("time", size=14)
plt.ylabel("Count", size=14)
plt.title("Sent to Open/Click")
plt.legend(loc='upper right')
plt.savefig("overlapping_histograms_with_matplotlib_Python_2.png")
# Label balance of the frame that was plotted.
df_3_sub['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'].value_counts()
### Just keep honeypot labels - 0, 1 ###
# Frame for the anomaly-detection experiment: rows with a model-based label are
# dropped, 'false' becomes 0 and both honeypot labels become 1.
target_col = 'BOT_ACTIVITY_FROM_CLICK_OR_OPEN'
is_model_label = df_3_sub[target_col].astype(str).isin(['Bot_open_model', 'Bot_click_model'])
# .copy() avoids assigning into a slice of df_3_sub (SettingWithCopyWarning).
df_sub_sub1 = df_3_sub.loc[~is_model_label].copy()
for raw_label, encoded in (
    ('false', 0),
    ('Bot_click_Honeypot', 1),
    ('Bot_click_Honeypot_around', 1),
):
    df_sub_sub1[target_col] = df_sub_sub1[target_col].replace(raw_label, encoded)
# Disabled alternatives: restrict to the label plus the five ME_* descriptor
# columns; map 'Bot_open_model' / 'Bot_click_model' to 1 instead of dropping them.
######### Pycaret ###########
#############################
# Importing module and initializing setup
from pycaret.anomaly import *
# Initialise the PyCaret anomaly-detection experiment on the honeypot-only frame.
# NOTE(review): no column is excluded, so the label column
# BOT_ACTIVITY_FROM_CLICK_OR_OPEN is part of the data handed to the detector -
# confirm this is intended.
# Options tried and left disabled: session_id=602, preprocess=True,
# imputation_type='simple', numeric_imputation='mean', normalize=True,
# transformation=True, handle_unknown_categorical=True,
# remove_multicollinearity=True, high_cardinality_features, fraction=0.05.
ano1 = setup(data=df_sub_sub1, session_id=323)
# creating a model
iforest = create_model('iforest')
# plotting a model
#plot_model(iforest, plot = 'tsne')#, feature = None, label = False, scale = 1, save False, display_format = None
# Attach the detector's output (the 'Anomaly' column and, with score=True, its score) to the data.
iforest_anomoly_label = assign_model(iforest, score=True, verbose=True)
iforest_anomoly_label
# Rows = honeypot label, columns = detector output. The original passed the two
# series in the opposite order, so "Actual"/"Predicted" named the wrong axes.
confusion_matrix = pd.crosstab(
    iforest_anomoly_label['BOT_ACTIVITY_FROM_CLICK_OR_OPEN'],
    iforest_anomoly_label['Anomaly'],
    rownames=['Actual'],
    colnames=['Predicted'],
)
print(confusion_matrix)
# tuned_iforest = tune_model(model = 'iforest', supervised_target = 'BOT_ACTIVITY_FROM_CLICK_OR_OPEN')
df_3_sub.columns
df_sub_sub.isnull().sum()
# isolation forest for imbalanced classification
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.ensemble import IsolationForest
# Define the outlier detection model.
# The `behaviour` argument was dropped: scikit-learn deprecated it and later
# removed it, so passing it warns or raises depending on the version.
# NOTE(review): contamination=0.9 is outside the (0, 0.5] range documented by
# scikit-learn and is rejected by recent releases. The model is fitted on the
# majority class only, so a small value or 'auto' is probably intended - confirm.
model = IsolationForest(contamination=0.9)

## X and y definition for train_test_split
# Work on a copy: the original aliased df_3_sub (`df_sub_sub = df_3_sub`), so the
# re-coding below silently overwrote the labels of df_3_sub as well.
df_sub_sub = df_3_sub.copy()
target_col = 'BOT_ACTIVITY_FROM_CLICK_OR_OPEN'
# Inliers ('false') -> 1, every bot label -> -1, matching IsolationForest's output coding.
for raw_label, encoded in (
    ('false', 1),
    ('Bot_click_Honeypot', -1),
    ('Bot_click_Honeypot_around', -1),
    ('Bot_open_model', -1),
    ('Bot_click_model', -1),
):
    df_sub_sub[target_col] = df_sub_sub[target_col].replace(raw_label, encoded)

categorical_cols = ['ME_Is_Unique_FROM_CLICK_OR_OPEN', 'ME Browser_FROM_CLICK_OR_OPEN',
                    'ME Email Client_FROM_CLICK_OR_OPEN',
                    'ME OperatingSystem_FROM_CLICK_OR_OPEN', 'ME Device_FROM_CLICK_OR_OPEN']
# Candidate features. Left out by the author: 'Open_to_Click', 'Sent_to_Click',
# 'reference_time_to_Open', 'reference_time_to_Click', 'reference_time_to_Sent'.
df_sub_sub_x = df_sub_sub[categorical_cols + ['Sent_to_Open']]

## One-hot encoding
# NOTE(review): only the categorical columns are encoded, so 'Sent_to_Open' is
# selected above but never reaches X - confirm this is intended.
one_hot_data = pd.get_dummies(df_sub_sub_x[categorical_cols], drop_first=True, dtype=float)

X, y = one_hot_data, df_sub_sub[target_col]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)

# Fit on the majority (inlier) class only.
X_train = X_train[y_train == 1]
model.fit(X_train)

# Detect outliers in the test set (1 = inlier, -1 = outlier).
yhat = model.predict(X_test)

# The labels are already coded 1 / -1; only convert them to a plain int array.
y_test = y_test.to_numpy().astype(str).astype(int)

# F1 with the outlier (bot) class as the positive class.
score = f1_score(y_test, yhat, pos_label=-1)
print('F1 Score: %.3f' % score)
# The original line here was the incomplete expression `df_sub_sub.` (a syntax
# error); a one-row preview matching the next line is assumed.
df_sub_sub.head(1)
df_3_sub.head(1)
#df_sub_sub = df_3_sub
# Keep honeypot-confirmed bots (1) and 'false' rows (-1) only: rows carrying a
# model-based label are dropped from the current df_sub_sub.
target_col = 'BOT_ACTIVITY_FROM_CLICK_OR_OPEN'
is_model_label = df_sub_sub[target_col].astype(str).isin(['Bot_open_model', 'Bot_click_model'])
# .copy() so the label assignments below do not target a slice (SettingWithCopyWarning).
df_sub_sub = df_sub_sub.loc[~is_model_label].copy()
for raw_label, encoded in (
    ('false', -1),
    ('Bot_click_Honeypot', 1),
    ('Bot_click_Honeypot_around', 1),
):
    df_sub_sub[target_col] = df_sub_sub[target_col].replace(raw_label, encoded)
# Disabled alternative: map 'Bot_open_model' / 'Bot_click_model' to -1 instead of dropping them.
import pycaret
from pycaret.classification import *
# Classification experiment on the full df_3_sub frame; only the e-mail
# identifier is excluded from the features.
# NOTE(review): the statements just above prepare df_sub_sub, but this setup is
# given df_3_sub - confirm which frame is intended.
exp_clf101 = setup(
    data=df_3_sub,
    target='BOT_ACTIVITY_FROM_CLICK_OR_OPEN',
    ignore_features=['ME Action Email'],
    ignore_low_variance=True,
    combine_rare_levels=True,
    data_split_stratify=True,
    remove_multicollinearity=True,
    use_gpu=True,
    session_id=123,
)
# Disabled options: profile=True, transformation=True, feature_selection=True,
# high_cardinality_features=['emailclient', 'operatingsystem'].
df_3_sub.shape

# Rank all candidate models, then train a few of them individually.
best_model = compare_models()
print(best_model)
rf = create_model('rf')
lgb = create_model('lightgbm')
print(lgb)
dt = create_model('dt')
print(dt)
# `knn` is not created in this experiment; this prints the model trained earlier.
print(knn)
plot_model(rf, plot='auc')
# Honeypot-only view of df_3_sub: rows with a model-based label are dropped.
target_col = 'BOT_ACTIVITY_FROM_CLICK_OR_OPEN'
df_sub_sub = df_3_sub
df_sub_sub = df_sub_sub.loc[
    ~df_sub_sub[target_col].astype(str).isin(['Bot_open_model', 'Bot_click_model'])
]
df_sub_sub.shape
df_sub_sub[target_col].value_counts()
df_3_sub[target_col].value_counts()

# NOTE(review): `create_model` / `assign_model` resolve to whichever pycaret
# module was star-imported last (the file imports both pycaret.anomaly and
# pycaret.classification) - confirm the anomaly experiment is the active one
# when this runs.
svm = create_model('svm')
svm_anomoly_label = assign_model(svm, score=True, verbose=True)
svm_anomoly_label
# Rows = honeypot label, columns = detector output. The original passed the two
# series in the opposite order, so "Actual"/"Predicted" named the wrong axes.
confusion_matrix = pd.crosstab(
    svm_anomoly_label[target_col],
    svm_anomoly_label['Anomaly'],
    rownames=['Actual'],
    colnames=['Predicted'],
)
print(confusion_matrix)